library(tidyverse)
library(arrow)
library(magrittr)
library(fs)
library(ggbeeswarm)
library(UpSetR)
library(arrow)
library(pheatmap)
library(viridis)
library(paletteer)
library(diann)
library(biomaRt)
library(clusterProfiler)
library(org.Hs.eg.db)
set.seed(0)
setwd("~/Documents/EMBL_UniHD/mini-benchmark")
org_cols <- c(
Ecoli = "#007B53",
Hsapiens = "#193F90"
)
## Overwrites all functions
select <- dplyr::select
# Auxiliary functions to convert units
## Function to convert units to MB
get_MB <- function(x) {
## Trim whitespace
x %<>% as.character() %>%
trimws()
## Gets numeric value
val <- str_extract(x, "[0-9]+(\\.[0-9]+)?") %>%
as.numeric()
## Gets unit
unit <- str_extract(x, "[A-Za-z]+") %>%
toupper()
## Gets MB
value_mb = case_when(
unit %in% c("KB", "K") ~ val / 1024,
unit %in% c("MB", "M") ~ val,
unit %in% c("GB", "G") ~ val * 1024,
unit %in% c("TB", "T") ~ val * 1024 * 1024,
TRUE ~ NA_real_
)
## Return megabytes
return(value_mb)
}
## Function to get time information into seconds
get_seconds <- function(x) {
x %<>% as.character()
## Gets time with regex
h <- x %>%
str_extract(., "\\d+(?=h)") %>%
as.numeric()
m <- x %>%
str_extract(., "\\d+(?=m)") %>%
as.numeric()
## Regex to allow decimal seconds
s <- x %>%
str_extract(., "\\d+(?:\\.\\d+)?(?=s)") %>%
as.numeric()
# Replace NAa with 0
h[is.na(h)] <- 0
m[is.na(m)] <- 0
s[is.na(s)] <- 0
## Return seconds
return(h * 3600 + m * 60 + s)
}
Gets metadata from server after running
ls -lh */*/data/*raw | tr -s ' ' '\t' > files_list.tsv
and then secure copying it to the local root.
raw_info <- read_tsv("./files_list.tsv",
col_names = FALSE)
head(raw_info)
## # A tibble: 6 × 9
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <chr> <dbl> <chr> <chr> <chr> <chr> <dbl> <time> <chr>
## 1 -rwxrwxr-x 1 efra efra 7.2G Apr 11 15:23 Astral/Ecoli/data/24100…
## 2 -rwxrwxr-x 1 efra efra 7.2G Apr 11 15:24 Astral/Ecoli/data/24100…
## 3 -rwxrwxr-x 1 efra efra 7.2G Apr 11 15:26 Astral/Ecoli/data/24100…
## 4 -rwxrwxr-x 1 efra efra 12G Apr 21 19:16 Astral/Hsapiens/data/24…
## 5 -rwxrwxr-x 1 efra efra 12G Apr 21 19:18 Astral/Hsapiens/data/24…
## 6 -rwxrwxr-x 1 efra efra 12G Apr 21 19:20 Astral/Hsapiens/data/24…
## Keeps just the necessary information
raw_info %<>% dplyr::select(c("X5", "X9")) %>%
purrr::set_names(c("size", "fullname"))
## Mutates the tibble
raw_info %<>% mutate(size_MB = get_MB(size)) %>%
separate(
fullname,
into = c("instrument", "organism", "subfolder", "file_name"),
sep = "/"
)
head(raw_info)
## # A tibble: 6 × 6
## size instrument organism subfolder file_name size_MB
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 7.2G Astral Ecoli data 241002_Astral_P3539_JS_30min_2mz_… 7373.
## 2 7.2G Astral Ecoli data 241002_Astral_P3539_JS_30min_2mz_… 7373.
## 3 7.2G Astral Ecoli data 241002_Astral_P3539_JS_30min_2mz_… 7373.
## 4 12G Astral Hsapiens data 241101_Astral_P3585_JS_DIA_60min_… 12288
## 5 12G Astral Hsapiens data 241101_Astral_P3585_JS_DIA_60min_… 12288
## 6 12G Astral Hsapiens data 241101_Astral_P3585_JS_DIA_60min_… 12288
ggplot(raw_info, aes(x = instrument, y = size_MB / 1024, colour = organism)) +
geom_beeswarm(size = 2, alpha = 0.7) +
scale_x_discrete(limits = c("FusionLumos", "Exploris480", "Astral")) +
scale_colour_manual(
name = "Organism",
values = org_cols
) +
theme_minimal() +
ylim(0, 12.5) +
labs(
x = "Mass Spectrometer",
y = "Raw file size (GB)"
) +
ggtitle("Size of raw files per experiment")
## Collapse into average size of raw files
raw_summary <- raw_info %>%
group_by(instrument, organism) %>%
summarise(
avg_raw_size = mean(size_MB, na.rm = TRUE),
## Cleans unnecessary metadata
.groups = "drop"
) %>%
## Converts back to GB
mutate(avg_raw_size = avg_raw_size / 1024)
fasta_info <- read_tsv("./fasta_list.tsv",
col_names = FALSE)
## Rows: 2 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): X1, X3, X4, X5, X6, X9
## dbl (2): X2, X7
## time (1): X8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fasta_info
## # A tibble: 2 × 9
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <chr> <dbl> <chr> <chr> <chr> <chr> <dbl> <time> <chr>
## 1 -rw-rw-r-- 1 efra efra 2.0M Apr 15 15:49 fastas/Ecoli_UP00000062…
## 2 -rw-rw-r-- 1 efra efra 14M Apr 15 15:49 fastas/Hsapiens_UP00000…
## Filter out unnecessary columns
fasta_info %<>% dplyr::select(c("X5", "X9")) %>%
purrr::set_names(c("size", "fullname")) %>%
mutate(size_MB = get_MB(size)) %>%
## Get organism name with regex
mutate(organism = str_extract(fullname, "(?<=/)[^/_]+(?=_)"))
Get times after doing
grep -m1 'duration: <strong>' */*/*/nf_report.html > runtimes.txt
runtimes <- read_delim("runtimes.txt",
delim = " ", ## By some reason grep created 10 whitespaces
col_names = FALSE) %>%
set_names(c("run", "string")) %>%
mutate(time = get_seconds(string) / 60,
instrument = word(run, 1, sep = "/"),
organism = word(run, 2, sep = "/"),
profile = word(run, 3, sep = "/")) %>%
select(!run) %>%
select(!string)
runtimes %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
ggplot() +
geom_beeswarm(aes(y = time / 60, x = instrument, colour = organism),
size = 2,
alpha = 0.8,
) +
scale_colour_manual(
name = "Organism",
values = org_cols
) +
geom_boxplot(aes(y = time / 60, x = instrument),
width = .25, alpha = 0,
outlier.shape = NA) +
scale_x_discrete(limits = c("FusionLumos", "Exploris480", "Astral")) +
theme_minimal() +
labs(
x = "Mass Spectrometer",
y = "Total runtime (h)",
title = "Runtime per experiment"
)
runtimes %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
ggplot() +
geom_beeswarm(aes(y = time / 60, x = organism, colour = instrument),
size = 2,
alpha = 0.9,
) +
geom_boxplot(aes(y = time / 60, x = organism),
width = .25, alpha = 0,
outlier.shape = NA) +
scale_x_discrete(limits = c("Ecoli", "Hsapiens")) +
theme_minimal() +
labs(
x = "Organism",
y = "Total runtime (h)"
)
size_time <- left_join(runtimes, raw_summary,
by = c("instrument", "organism"))
size_time %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
ggplot(aes(x = avg_raw_size, y = time /60 ,
colour = organism,
shape = instrument)) +
geom_point(size = 3, alpha = 0.8) +
scale_colour_manual(
name = "Organism",
values = org_cols
) +
theme_minimal() +
labs(
x = "Average raw file size (GB)",
y = "Total runtime (h)",
title = "Runtime vs file size"
)
size_time %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
ggplot(aes(x = avg_raw_size, y = time /60 ,
colour = instrument,
shape = organism)) +
geom_point(size = 2, alpha = 0.8) +
theme_minimal() +
labs(
x = "Average raw file size (GB)",
y = "Total runtime (h)",
title = "Runtime vs file size"
)
size_time %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
filter(instrument == "Astral") %>%
ggplot(aes(x = avg_raw_size, y = time /60 ,
colour = organism)) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(
name = "Organism",
values = org_cols
) +
theme_minimal() +
labs(
x = "Average raw file size (GB)",
y = "Total runtime (h)"
) +
ggtitle("Astral runs")
size_time %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
filter(instrument == "Exploris480") %>%
ggplot(aes(x = avg_raw_size, y = time /60 ,
colour = organism)) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(
name = "Organism",
values = org_cols
) +
theme_minimal() +
labs(
x = "Average raw file size (GB)",
y = "Total runtime (h)"
) +
ggtitle("Exploris480 runs")
size_time %>%
filter(!str_detect(profile, "1\\.9\\.1")) %>%
filter(instrument == "FusionLumos") %>%
ggplot(aes(x = avg_raw_size, y = time /60 ,
colour = organism)) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(
name = "Organism",
values = org_cols
) +
theme_minimal() +
labs(
x = "Average raw file size (GB)",
y = "Total runtime (h)"
) +
ggtitle("FusionLumos runs")
deconv_rt <- function(x, ms, org) {
x %>%
filter(
!str_detect(profile, "1\\.9\\.1"),
instrument == ms,
organism == org
) %>%
mutate(
convert_tool = word(profile, 1, sep = "_"),
diann_version = word(profile, 2, sep = "_"),
) %>%
mutate(
diann_version = replace_na(diann_version, "diannv2.1")
) %>%
ggplot(aes(x = convert_tool, y = time / 60, shape = diann_version)) +
geom_beeswarm(size = 3, color = "#18974C") +
theme_minimal() +
labs(
x = "Raw conversion tool",
y = "Total runtime (h)"
) +
ggtitle(paste("Pipeline runtimes for", org, "measured in", ms))
}
deconv_rt(size_time, "Astral", "Hsapiens") +
ylim(0, NA)
deconv_rt(size_time, "Astral", "Ecoli") +
ylim(0, NA)
## Gets names of the traces TSVs
traces_files <- dir_ls(
path = ".",
recurse = TRUE,
glob = "*nf_trace.tsv"
)
## Reads all the trace files
traces_list <- traces_files %>%
purrr::set_names() %>%
map(read_tsv)
## Using map because it is a list of tibbles
traces_list %<>% map(
~ mutate(
.x,
### Convert time metrics to seconds
duration_seconds = get_seconds(duration),
realtime_seconds = get_seconds(realtime),
### Convert memory metrics to MB
peak_rss_MB = get_MB(peak_rss),
peak_vmem_MB = get_MB(peak_vmem),
rchar_MB = get_MB(rchar),
wchar_MB = get_MB(wchar),
### Get CPU percentage
cpu_percentage = .x %>%
pluck("%cpu") %>%
str_remove("%") %>%
as.numeric()
)
)
## Adds metadata from the tibble name as column
## Use imap() instead if map2() for simplicity
traces_list %<>% imap(~ .x %>%
mutate(
instrument = word(.y, 1, sep = "/"),
organism = word(.y, 2, sep = "/"),
profile = word(.y, 3, sep = "/")
))
traces_list[1] %>% str()
## List of 1
## $ Astral/Ecoli/diannv2.1/nf_trace.tsv: tibble [4 × 24] (S3: tbl_df/tbl/data.frame)
## ..$ task_id : num [1:4] 1 2 3 4
## ..$ hash : chr [1:4] "fe/803e00" "48/d239c6" "a6/987de5" "ee/aff423"
## ..$ native_id : num [1:4] 2913659 2915213 2924176 2924182
## ..$ name : chr [1:4] "diann_2_1_workflow:fasta_to_speclib" "diann_2_1_workflow:diann_search" "diann_2_1_workflow:copy_files" "diann_2_1_workflow:diann_stats (1)"
## ..$ status : chr [1:4] "COMPLETED" "COMPLETED" "COMPLETED" "COMPLETED"
## ..$ exit : num [1:4] 0 0 0 0
## ..$ submit : POSIXct[1:4], format: "2025-04-20 19:27:11" "2025-04-20 19:28:00" ...
## ..$ duration : chr [1:4] "48.8s" "8m 41s" "140ms" "9.1s"
## ..$ realtime : chr [1:4] "48.5s" "8m 40s" "28ms" "8.7s"
## ..$ %cpu : chr [1:4] "2366.8%" "1409.0%" "104.9%" "327.3%"
## ..$ peak_rss : chr [1:4] "5.7 GB" "18.4 GB" "0" "614.8 MB"
## ..$ peak_vmem : chr [1:4] "9.9 GB" "1.5 TB" "0" "7.9 GB"
## ..$ rchar : chr [1:4] "177.6 MB" "421.2 MB" "25.4 MB" "155.6 MB"
## ..$ wchar : chr [1:4] "175.2 MB" "70 MB" "25.3 MB" "421.2 KB"
## ..$ duration_seconds: num [1:4] 48.8 521 8400 9.1
## ..$ realtime_seconds: num [1:4] 48.5 520 1680 8.7
## ..$ peak_rss_MB : num [1:4] 5837 18842 NA 615
## ..$ peak_vmem_MB : num [1:4] 10138 1572864 NA 8090
## ..$ rchar_MB : num [1:4] 177.6 421.2 25.4 155.6
## ..$ wchar_MB : num [1:4] 175.2 70 25.3 0.411
## ..$ cpu_percentage : num [1:4] 2367 1409 105 327
## ..$ instrument : chr [1:4] "Astral" "Astral" "Astral" "Astral"
## ..$ organism : chr [1:4] "Ecoli" "Ecoli" "Ecoli" "Ecoli"
## ..$ profile : chr [1:4] "diannv2.1" "diannv2.1" "diannv2.1" "diannv2.1"
summarize everything
## Summarize total times per <instrument/organism/profile>
traces_times <- imap_dfr(traces_list, ~ {
# split the list‐name (.y) into components
comb <- str_split(.y, "/", simplify = TRUE)
tibble(
instrument = comb[1],
organism = comb[2],
profile = comb[3],
## Also convert to minutes
total_duration_min = sum(.x$duration_seconds, na.rm = TRUE) %>% { . / 60 },
total_realtime_min = sum(.x$realtime_seconds, na.rm = TRUE) %>% { . / 60 }
)
})
## Standard QC thresholds on DIA-NN reports
## Extracts high quality peptides
qc <- function(x) {
x %>%
collect() %>%
filter(
## Standard thresholds
Q.Value <= 0.01,
Lib.Q.Value <= 0.01,
Lib.PG.Q.Value <= 0.01,
PG.Q.Value <= 0.05,
### Filters out contaminants
!str_detect(Protein.Group, "contam_sp"),
!str_detect(Protein.Ids, "contam_sp")
) %>%
pull(Precursor.Id) %>%
unique()
}
## Function to perform qc on the report's precursor
## and filter them out from the matrices
clean <- function(report, mat) {
out <- mat %>%
filter(Precursor.Id %in% qc(report)) %>%
## Collapses to precursor level and removes missing values
mutate(
## Converts NAs to 0s
across(c(ends_with("mzML"), ends_with("raw")),
~ replace_na(.x, 0))
) %>%
group_by(Precursor.Id) %>%
mutate(
## Sum intensities per precursors
across(c(ends_with("mzML"), ends_with("raw")),
sum),
) %>%
ungroup() %>%
filter(
## Filters our zeros
!if_any(c(ends_with("mzML"), ends_with("raw")),
~. == 0)
)
}
## Lists parquets
parquet_list <- dir_ls(
path = ".",
recurse = TRUE,
glob = "*report.parquet"
) %>%
## Reads all the parquet files
purrr::set_names() %>%
map(open_dataset, format = "parquet")
## Lists DIA-NN v1.8 reports
diann_list <- dir_ls(
path = ".",
recurse = TRUE,
glob = "*diann-report"
) %>%
## Read DIA-NN v1.8 reports
purrr::set_names() %>%
map(diann_load)
## Concatenates all
all_reports <- c(
parquet_list,
diann_list
)
## Lists matrices
pr_matrixes <- dir_ls(
path = ".",
recurse = TRUE,
regexp = "*pr_matrix.tsv"
) %>%
## Exclude firts passed
str_subset("first-pass", negate = TRUE) %>%
purrr::set_names() %>%
map(read_tsv)
# Performs QC
## Gets <instrument/organism/profile> keys
keys <- all_reports %>%
names() %>%
str_replace("^((?:[^/]+/){2}[^/]+).*", "\\1")
keyed_reports <- all_reports %>%
set_names(
sub("^((?:[^/]+/){2}[^/]+).*", "\\1", names(.) )
)
keyed_matrices <- pr_matrixes %>%
set_names(
sub("^((?:[^/]+/){2}[^/]+).*", "\\1", names(.) )
)
pr_matrixes_cleaned <- imap(
keyed_reports,
~ if(.y %in% names(keyed_matrices))
clean(.x, keyed_matrices[[.y]])
)
# Functions to extract information from DIA-NN outputs
## Get protein groups from parquet DIA-NN output
pg <- function(x) {
x %>%
collect() %>%
pull(Protein.Group) %>%
unique()
}
## Get proteins from parquet DIA-NN output
proteins <- function(x) {
x %>%
collect() %>%
pull(Protein.Ids) %>%
unique()
}
## Get peptides from parquet DIA-NN output
peptides <- function(x) {
x %>%
collect() %>%
pull(Precursor.Id) %>%
unique()
}
pg_list <- c(
pr_matrixes_cleaned %>% map(pg)
)
proteins_list <- c(
pr_matrixes_cleaned %>% map(proteins)
)
peptides_list <- c(
pr_matrixes_cleaned %>% map(peptides)
)
# Auxiliary functions to get Jaccard indexes
## Computes Jaccard Index
jaccard <- function(x, y) {
length(intersect(x, y)) / length(union(x, y))
}
## Commputes jaccard index pairwise of list
jpw <- function(x) {
n <- names(x)
## Expands pairwise
expand_grid(
names1 = n,
names2 = n,
) %>%
## Maps the function fer columns
mutate(
index = map2_dbl(x[names1], x[names2], jaccard)
) %>%
## Comes back to squared datafra,e
pivot_wider(
names_from = names2,
values_from = index
) %>%
column_to_rownames("names1")
}
## Function to automatize heatmap plotting
jaccard_hm <- function(x, string) {
x %>%
jpw() %>%
pheatmap(
main = string,
cluster_rows = TRUE,
cluster_cols = TRUE,
border_color = NA,
color = viridis(100),
angle_col = 45,
fontsize_row = 10,
fontsize_col = 10,
cellwidth = 30,
cellheight = 30
)
}
## Function to automatize upset plotting
jaccard_us <- function(x) {
x %>%
fromList() %>%
upset(nsets = ncol(.),
order.by = "freq",
decreasing = TRUE,
nintersects = 10)
}
pg_hsapiens <- pg_list %>%
{ .[str_detect(names(.), "Hsapiens")] }
pg_hsapiens_astral <- pg_hsapiens %>%
{ .[str_detect(names(.), "Astral")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pg_hsapiens_astral, "Astral H sapiens | Pairwise Jaccard Index of identified protein groups")
jaccard_us(pg_hsapiens_astral)
pg_hsapiens_exploris <- pg_hsapiens %>%
{ .[str_detect(names(.), "Exploris480")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pg_hsapiens_exploris, "Exploris480 H sapiens | Pairwise Jaccard Index")
jaccard_us(pg_hsapiens_exploris)
pg_hsapiens_lumos <- pg_hsapiens %>%
{ .[str_detect(names(.), "FusionLumos")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pg_hsapiens_lumos, "FusionLumos H sapiens | Pairwise Jaccard Index")
jaccard_us(pg_hsapiens_lumos)
pg_ecoli <- pg_list %>%
{ .[str_detect(names(.), "Ecoli")] }
pg_ecoli_astral <- pg_ecoli %>%
{ .[str_detect(names(.), "Astral")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pg_ecoli_astral, "Astral E coli | Pairwise Jaccard Index")
jaccard_us(pg_ecoli_astral)
pg_ecoli_exploris <- pg_ecoli %>%
{ .[str_detect(names(.), "Exploris480")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pg_ecoli_exploris, "Exploris480 E coli | Pairwise Jaccard Index")
jaccard_us(pg_ecoli_exploris)
pg_ecoli_lumos <- pg_ecoli %>%
{ .[str_detect(names(.), "FusionLumos")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pg_ecoli_lumos, "FusionLumos E coli | Pairwise Jaccard Index")
jaccard_us(pg_ecoli_lumos)
prots_hsapiens <- proteins_list %>%
{ .[str_detect(names(.), "Hsapiens")] }
prots_hsapiens_astral <- prots_hsapiens %>%
{ .[str_detect(names(.), "Astral")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(prots_hsapiens_astral, "Astral H sapiens | Pairwise Jaccard Index of identified proteins")
jaccard_us(prots_hsapiens_astral)
prots_hsapiens_exploris <- prots_hsapiens %>%
{ .[str_detect(names(.), "Exploris480")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(prots_hsapiens_exploris, "Exploris480 H sapiens | Pairwise Jaccard Index")
jaccard_us(prots_hsapiens_exploris)
prots_hsapiens_lumos <- prots_hsapiens %>%
{ .[str_detect(names(.), "FusionLumos")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(prots_hsapiens_lumos, "FusionLumos H sapiens | Pairwise Jaccard Index")
jaccard_us(prots_hsapiens_lumos)
prots_ecoli <- pg_list %>%
{ .[str_detect(names(.), "Ecoli")] }
prots_ecoli_astral <- prots_ecoli %>%
{ .[str_detect(names(.), "Astral")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(prots_ecoli_astral, "Astral E coli | Pairwise Jaccard Index")
jaccard_us(prots_ecoli_astral)
prots_ecoli_exploris <- prots_ecoli %>%
{ .[str_detect(names(.), "Exploris480")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(prots_ecoli_exploris, "Exploris480 E coli | Pairwise Jaccard Index")
jaccard_us(prots_ecoli_exploris)
prots_ecoli_lumos <- prots_ecoli %>%
{ .[str_detect(names(.), "FusionLumos")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(prots_ecoli_lumos, "FusionLumos E coli | Pairwise Jaccard Index")
jaccard_us(prots_ecoli_lumos)
pepts_hsapiens <- peptides_list %>%
{ .[str_detect(names(.), "Hsapiens")] }
pepts_hsapiens_astral <- pepts_hsapiens %>%
{ .[str_detect(names(.), "Astral")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pepts_hsapiens_astral, "Astral H sapiens | Pairwise Jaccard Index of identified precursors")
jaccard_us(pepts_hsapiens_astral)
pepts_hsapiens_exploris <- pepts_hsapiens %>%
{ .[str_detect(names(.), "Exploris480")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pepts_hsapiens_exploris, "Exploris480 H sapiens | Pairwise Jaccard Index")
jaccard_us(pepts_hsapiens_exploris)
pepts_hsapiens_lumos <- pepts_hsapiens %>%
{ .[str_detect(names(.), "FusionLumos")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pepts_hsapiens_lumos, "FusionLumos H sapiens | Pairwise Jaccard Index")
jaccard_us(pepts_hsapiens_lumos)
pepts_ecoli <- peptides_list %>%
{ .[str_detect(names(.), "Ecoli")] }
pepts_ecoli_astral <- pepts_ecoli %>%
{ .[str_detect(names(.), "Astral")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pepts_ecoli_astral, "Astral E coli | Pairwise Jaccard Index")
jaccard_us(pepts_ecoli_astral)
pepts_ecoli_exploris <- pepts_ecoli %>%
{ .[str_detect(names(.), "Exploris480")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pepts_ecoli_exploris, "Exploris480 E coli | Pairwise Jaccard Index")
jaccard_us(pepts_ecoli_exploris)
pepts_ecoli_lumos <- pepts_ecoli %>%
{ .[str_detect(names(.), "FusionLumos")] } %>%
set_names( word(names(.), 3, sep = "/") )
jaccard_hm(pepts_ecoli_lumos, "FusionLumos E coli | Pairwise Jaccard Index")
jaccard_us(pepts_ecoli_lumos)
## Function to compute cross validations
compute_cv <- function(x) {
x %>%
rowwise() %>%
mutate(
cv = {
c_across(c(ends_with("mzML"), ends_with("raw"))) %>%
log() %>%
{ sd(.)/mean(.) }
}
) %>%
ungroup()
}
##
#pr_matrixes_cleaned %<>% lapply(compute_cv)
## Gets the common peptides detected across all profiles
common_peptds <- purrr::reduce(pepts_hsapiens_astral, intersect)
## Gets data just from Astral and Human
prs_hsapiens_astral <- pr_matrixes_cleaned %>%
{ .[str_detect(names(.), "Hsapiens")] } %>%
{ .[str_detect(names(.), "Astral")] } %>%
## Just common peptides
lapply(filter, Precursor.Id %in% common_peptds) %>%
## Compute cvs
lapply(compute_cv)
## Compute mean
#lapply(compute_means)
## Extracts cvs
cvs <- prs_hsapiens_astral %>%
## Creates tibble with sources and all cv's
imap_dfr(
~ tibble(
profile = .y,
cv = .x$cv
)
) %>%
mutate(
## Deletes unnecessary info
profile = word(profile, 3, sep = "/")
)
## Violin and boxplot
ggplot(cvs, aes(x = profile, y = cv)) +
geom_violin(color = "gray",
fill = "#6CC24A",
alpha = 0.5) +
geom_boxplot(fill = "#18974C",
width = .25,
outlier.shape = NA) +
theme_minimal() +
scale_y_sqrt() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
labs(
x = "Workflow",
y = "Coefficient of Variation",
title = "CV's across different workflows | Astral, H sapiens"
)
## Just boxplot
ggplot(cvs, aes(x = profile, y = cv)) +
geom_boxplot(fill = "#18974C",
width = .5,
outlier.shape = NA) +
ylim(0, 0.035) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
labs(
x = "Workflow",
y = "Coefficient of Variation",
title = "CV's across different workflows | Astral, H sapiens"
)
## Warning: Removed 35422 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
### All with peptides identified with newer DIA-NN versions
list_pepts_no_diann1_8 <- pepts_hsapiens_astral[c(
"diannv2.1",
"msconvert_diannv1.9.2",
"msconvert_diannv2.0",
"msconvert_diannv2.0.1",
"msconvert_diannv2.0.2",
"thermoraw_diannv1.9.2",
"thermoraw_diannv2.0",
"thermoraw_diannv2.0.1",
"thermoraw_diannv2.0.2"
)]
## Intersection off all sets
pepts_no_diann1_8 <- Reduce(intersect, list_pepts_no_diann1_8)
## Intersection of set of dia-nn 1.8
pepts_diann1_8 <- c(
pepts_hsapiens_astral$msconvert_diannv1.8.1,
pepts_hsapiens_astral$thermoraw_diannv1.8.1
)
## Proteins identified by all versions but DIA-NN 1.8
pepts_unid <- setdiff(pepts_no_diann1_8, pepts_diann1_8)
## Flags if the precursor was identified by DIA-NN 1.8 or not in the precrusos matrix
all_prs_mat <- pr_matrixes_cleaned %>%
{ .[str_detect(names(.), "Hsapiens")] } %>%
{ .[str_detect(names(.), "Astral")] } %>%
map(
~.x %>%
mutate(is_unid = Precursor.Id %in% pepts_unid)
)
## Pivots everythin into a single object
all_long <- map_dfr(
all_prs_mat,
~ .x %>%
pivot_longer(
cols = c(ends_with("mzML"), ends_with("raw")),
names_to = "sample",
values_to = "intensity"
),
.id = "tibble_id"
)
## Plots all densities of all samples together
ggplot(all_long, aes(x = log(intensity),
color = !(is_unid),
group = interaction(sample, !(is_unid)))) +
geom_density(alpha = 0.7) +
theme_minimal() +
labs(
title = "Intensity densities of all Astral H. sapiens runs",
x = "log(Intensity)",
y = "Density",
color = "Identified by DIA-NN 1.8"
)
### All with proteins identified with newer DIA-NN versions
list_prots_no_diann1_8 <- prots_hsapiens_astral[c(
"diannv2.1",
"msconvert_diannv1.9.2",
"msconvert_diannv2.0",
"msconvert_diannv2.0.1",
"msconvert_diannv2.0.2",
"thermoraw_diannv1.9.2",
"thermoraw_diannv2.0",
"thermoraw_diannv2.0.1",
"thermoraw_diannv2.0.2"
)]
#4 Intersection off all sets
prots_no_diann1_8 <- Reduce(intersect, list_prots_no_diann1_8)
## Union of set of dia-nn 1.8
prots_diann1_8 <- c(
prots_hsapiens_astral$msconvert_diannv1.8.1,
prots_hsapiens_astral$thermoraw_diannv1.8.1
)
## Proteins identified by all versions but DIA-NN 1.8
prots_unid <- setdiff(prots_no_diann1_8, prots_diann1_8)
## All proteins identified with all versions
all_prots <- unlist(prots_hsapiens_astral) %>%
unique()
## Mart
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
## Convert ID's
all_prots_ids <- getBM(
attributes = c("uniprotswissprot", "entrezgene_id"),
filters = "uniprotswissprot",
values = all_prots,
mart = mart
) %>%
pull("entrezgene_id") %>%
as.character()
interest_ids <- getBM(
attributes = c("uniprotswissprot", "entrezgene_id"),
filters = "uniprotswissprot",
values = prots_unid,
mart = mart
) %>%
pull("entrezgene_id") %>%
as.character()
## GO Enrichment Analysis
### Biological Process
go_bp <- enrichGO(
gene = interest_ids,
universe = all_prots_ids,
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
### Molecular Function
go_mf <- enrichGO(
gene = interest_ids,
universe = all_prots_ids,
OrgDb = org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
### Cellular Component
go_cc <- enrichGO(
gene = interest_ids,
universe = all_prots_ids,
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
## All ontologies
go_all <- enrichGO(
gene = interest_ids,
universe = all_prots_ids,
OrgDb = org.Hs.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
## Plots
barplot(go_mf)
dotplot(go_mf)
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Mexico_City
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] org.Hs.eg.db_3.18.0 AnnotationDbi_1.64.1 IRanges_2.36.0
## [4] S4Vectors_0.40.2 Biobase_2.62.0 BiocGenerics_0.48.1
## [7] clusterProfiler_4.10.1 biomaRt_2.58.2 diann_1.0.1
## [10] paletteer_1.6.0 viridis_0.6.5 viridisLite_0.4.2
## [13] pheatmap_1.0.12 UpSetR_1.4.0 ggbeeswarm_0.7.2
## [16] fs_1.6.6 magrittr_2.0.3 arrow_19.0.1
## [19] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [22] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [25] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1
## [28] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_1.8.9
## [4] farver_2.1.2 rmarkdown_2.28 zlibbioc_1.48.2
## [7] vctrs_0.6.5 memoise_2.0.1 RCurl_1.98-1.16
## [10] ggtree_3.10.1 htmltools_0.5.8.1 progress_1.2.3
## [13] curl_5.2.3 gridGraphics_0.5-1 sass_0.4.9
## [16] bslib_0.8.0 plyr_1.8.9 cachem_1.1.0
## [19] igraph_2.1.1 lifecycle_1.0.4 pkgconfig_2.0.3
## [22] gson_0.1.0 Matrix_1.6-5 R6_2.5.1
## [25] fastmap_1.2.0 GenomeInfoDbData_1.2.11 aplot_0.2.3
## [28] digest_0.6.37 enrichplot_1.22.0 colorspace_2.1-1
## [31] rematch2_2.1.2 patchwork_1.3.0 RSQLite_2.3.9
## [34] labeling_0.4.3 filelock_1.0.3 fansi_1.0.6
## [37] timechange_0.3.0 httr_1.4.7 polyclip_1.10-7
## [40] compiler_4.3.3 bit64_4.5.2 withr_3.0.2
## [43] BiocParallel_1.36.0 DBI_1.2.3 highr_0.11
## [46] ggforce_0.4.2 MASS_7.3-60.0.1 rappdirs_0.3.3
## [49] HDO.db_0.99.1 tools_4.3.3 vipor_0.4.7
## [52] scatterpie_0.2.4 ape_5.8 beeswarm_0.4.0
## [55] glue_1.8.0 nlme_3.1-166 GOSemSim_2.28.1
## [58] shadowtext_0.1.4 grid_4.3.3 reshape2_1.4.4
## [61] fgsea_1.28.0 generics_0.1.3 gtable_0.3.5
## [64] tzdb_0.4.0 data.table_1.16.2 hms_1.1.3
## [67] tidygraph_1.3.1 xml2_1.3.6 utf8_1.2.4
## [70] XVector_0.42.0 ggrepel_0.9.6 pillar_1.9.0
## [73] vroom_1.6.5 yulab.utils_0.1.7 splines_4.3.3
## [76] tweenr_2.0.3 treeio_1.26.0 BiocFileCache_2.10.2
## [79] lattice_0.22-6 bit_4.5.0 tidyselect_1.2.1
## [82] GO.db_3.18.0 Biostrings_2.70.3 knitr_1.48
## [85] gridExtra_2.3 xfun_0.48 graphlayouts_1.2.0
## [88] stringi_1.8.4 lazyeval_0.2.2 ggfun_0.1.6
## [91] yaml_2.3.10 evaluate_1.0.1 codetools_0.2-20
## [94] RcppEigen_0.3.4.0.2 ggraph_2.2.1 qvalue_2.34.0
## [97] ggplotify_0.1.2 cli_3.6.3 munsell_0.5.1
## [100] jquerylib_0.1.4 Rcpp_1.0.13 GenomeInfoDb_1.38.8
## [103] dbplyr_2.5.0 png_0.1-8 XML_3.99-0.17
## [106] parallel_4.3.3 assertthat_0.2.1 blob_1.2.4
## [109] prettyunits_1.2.0 DOSE_3.28.2 bitops_1.0-9
## [112] tidytree_0.4.6 scales_1.3.0 crayon_1.5.3
## [115] rlang_1.1.4 cowplot_1.1.3 fastmatch_1.1-4
## [118] KEGGREST_1.42.0